Introduction

Check out this Kaggle dataset (Hotel Booking Demand, hotel_bookings.csv).

Business Goals

  1. Predict the number of adults for the resort and city hotels for the next 4 weeks
    • Note: one booking can be for multiple adults, so aggregate accordingly
  2. Predict average daily rate (adr) prices based on other features

Ideas

  1. fb’s prophet package
  2. random forest, elastic net

Get Data

# load raw bookings, standardize column names, encode strings as factors,
# sort columns alphabetically then group by type (Date, factor, numeric),
# and keep only non-canceled bookings; the two bookkeeping columns are
# dropped at the end of the pipeline
a = read_csv('hotel_bookings.csv') %>%
  clean_names() %>%
  mutate(across(where(is.character), factor)) %>%
  select(sort(tidyselect::peek_vars())) %>%
  select(
    where(is.Date),
    where(is.factor),
    where(is.numeric)
  ) %>%
  filter(is_canceled == 0) %>% #filter to non-canceled bookings
  select(-is_canceled, -reservation_status_date)

Resources

  1. adr
  2. transient
  3. group rate
  4. Guide to Hotel Management
  5. distribution channels

5 min EDA

a %>% head

Not sure exactly what ‘reservation_status_date’ refers to; for a date field, will create an ‘arrival.date’ col to use in its place

#I don't really understand this var; will create an 'arrival.date' var for a date field
# NOTE(review): this is redundant — reservation_status_date was already
# removed right after loading; kept as a harmless no-op guard
a$reservation_status_date = NULL

skimr::skim(a)
Data summary
Name a
Number of rows 75166
Number of columns 30
_______________________
Column type frequency:
factor 13
numeric 17
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
agent 0 1 FALSE 315 9: 18697, NUL: 12310, 240: 8438, 7: 3065
arrival_date_month 0 1 FALSE 12 Aug: 8638, Jul: 7919, May: 7114, Oct: 6914
assigned_room_type 0 1 FALSE 10 A: 41105, D: 18960, E: 5838, F: 2824
company 0 1 FALSE 332 NUL: 69560, 40: 850, 223: 665, 45: 222
country 0 1 FALSE 166 PRT: 21071, GBR: 9676, FRA: 8481, ESP: 6391
customer_type 0 1 FALSE 4 Tra: 53099, Tra: 18735, Con: 2814, Gro: 518
deposit_type 0 1 FALSE 3 No : 74947, Ref: 126, Non: 93
distribution_channel 0 1 FALSE 5 TA/: 57718, Dir: 12088, Cor: 5203, GDS: 156
hotel 0 1 FALSE 2 Cit: 46228, Res: 28938
market_segment 0 1 FALSE 7 Onl: 35738, Off: 15908, Dir: 10672, Gro: 7714
meal 0 1 FALSE 5 BB: 57800, HB: 9479, SC: 6684, Und: 883
reservation_status 0 1 FALSE 1 Che: 75166, Can: 0, No-: 0
reserved_room_type 0 1 FALSE 9 A: 52364, D: 13099, E: 4621, F: 2017

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
adr 0 1 99.99 49.21 -6.38 67.5 92.5 125 510 <U+2587><U+2586><U+2581><U+2581><U+2581>
adults 0 1 1.83 0.51 0.00 2.0 2.0 2 4 <U+2581><U+2582><U+2587><U+2581><U+2581>
arrival_date_day_of_month 0 1 15.84 8.78 1.00 8.0 16.0 23 31 <U+2587><U+2587><U+2587><U+2587><U+2586>
arrival_date_week_number 0 1 27.08 13.90 1.00 16.0 28.0 38 53 <U+2586><U+2587><U+2587><U+2587><U+2586>
arrival_date_year 0 1 2016.15 0.70 2015.00 2016.0 2016.0 2017 2017 <U+2583><U+2581><U+2587><U+2581><U+2586>
babies 0 1 0.01 0.11 0.00 0.0 0.0 0 10 <U+2587><U+2581><U+2581><U+2581><U+2581>
booking_changes 0 1 0.29 0.74 0.00 0.0 0.0 0 21 <U+2587><U+2581><U+2581><U+2581><U+2581>
children 0 1 0.10 0.39 0.00 0.0 0.0 0 3 <U+2587><U+2581><U+2581><U+2581><U+2581>
days_in_waiting_list 0 1 1.59 14.78 0.00 0.0 0.0 0 379 <U+2587><U+2581><U+2581><U+2581><U+2581>
is_repeated_guest 0 1 0.04 0.20 0.00 0.0 0.0 0 1 <U+2587><U+2581><U+2581><U+2581><U+2581>
lead_time 0 1 79.98 91.11 0.00 9.0 45.0 124 737 <U+2587><U+2582><U+2581><U+2581><U+2581>
previous_bookings_not_canceled 0 1 0.20 1.81 0.00 0.0 0.0 0 72 <U+2587><U+2581><U+2581><U+2581><U+2581>
previous_cancellations 0 1 0.02 0.27 0.00 0.0 0.0 0 13 <U+2587><U+2581><U+2581><U+2581><U+2581>
required_car_parking_spaces 0 1 0.10 0.30 0.00 0.0 0.0 0 8 <U+2587><U+2581><U+2581><U+2581><U+2581>
stays_in_week_nights 0 1 2.46 1.92 0.00 1.0 2.0 3 50 <U+2587><U+2581><U+2581><U+2581><U+2581>
stays_in_weekend_nights 0 1 0.93 0.99 0.00 0.0 1.0 2 19 <U+2587><U+2581><U+2581><U+2581><U+2581>
total_of_special_requests 0 1 0.71 0.83 0.00 0.0 1.0 1 5 <U+2587><U+2581><U+2581><U+2581><U+2581>

clean/encode data

# make arrival date var: assemble a proper Date from the year / month-name /
# day-of-month component columns (month.name maps 'January'..'December' to 1..12)
a = a %>% mutate(
  arrival.date = make_date(
    year = arrival_date_year,
    month = match(arrival_date_month, month.name),
    day = arrival_date_day_of_month)
  ) %>% arrange(arrival.date)

### will use this for later on (anomaly detection needs the numeric date parts)
a.anomaly = a

# these numeric vars s/b factor vars
# across() replaces the superseded mutate_at()/vars() combination
a = a %>% mutate(across(c(arrival_date_day_of_month, arrival_date_week_number, arrival_date_year, is_repeated_guest), factor))

# reordering df
# reorder the data frame: columns alphabetical first, then regrouped by
# type so Dates lead, factors follow, numerics last
a = a %>%
  select(sort(tidyselect::peek_vars())) %>%
  select(where(is.Date), where(is.factor), where(is.numeric))

You’ll have to repeat the steps above for the test data when preprocessing with recipes and the step_XXX functions.

EDA: time series

range

# report the span of the dataset; compute the range once instead of twice
date.range = a %>% pull(arrival.date) %>% range
paste(
  'The date range of this dataset is from',
  date.range[1],
  'to',
  date.range[2],
  ', just over 2 years of data.' # 2015-07-01 to 2017-08-31 is ~26 months, not 3 years
)
## [1] "The date range of this dataset is from 2015-07-01 to 2017-08-31 , just over 2 years of data."

time series graph

# total headcount per arrival date: sum() with multiple vector arguments
# adds adults + children + babies together within each date group
a %>% group_by(arrival.date) %>% 
  summarise(total.individuals = sum(adults, children, babies)) %>% 
  arrange(arrival.date) %>%
  plot_ly(
    x = ~arrival.date,
    y = ~total.individuals
  ) %>% layout(
    title = 'total.individuals by date',
    xaxis = list(title = ''),
    yaxis = list(title = '')
    )

time series graph – grouped (hotel)

# total headcount per arrival date, split by hotel type
a %>% group_by(arrival.date, hotel) %>% 
  summarise(total.individuals = sum(adults, children, babies), .groups = 'drop') %>% 
  arrange(arrival.date) %>%
  plot_ly(
    x = ~arrival.date,
    y = ~total.individuals,
    color = ~hotel,
    alpha = 0.7 # fixed: was `alphtrain`, a typo plot_ly silently ignores
  )  %>% layout(
    title = 'total.individuals by date/hotel',
    xaxis = list(title = ''),
    yaxis = list(title = '')
    ) 

time series graph – grouped (customer_type)

# total headcount per arrival date, split by customer type
a %>% group_by(arrival.date, customer_type) %>% 
  summarise(total.individuals = sum(adults, children, babies), .groups = 'drop') %>%  
  arrange(arrival.date) %>%
  plot_ly(
    x = ~arrival.date,
    y = ~total.individuals,
    color = ~customer_type,
    alpha = 0.7 # fixed: was `alphtrain`, a typo plot_ly silently ignores
  ) %>% layout(
    title = 'total.individuals by date/customer_type',
    xaxis = list(title = ''),
    yaxis = list(title = '')
    )

time series graph – grouped (deposit_type)

# total headcount per arrival date, split by deposit type
a %>% group_by(arrival.date, deposit_type) %>% 
  summarise(total.individuals = sum(adults, children, babies), .groups = 'drop') %>%   
  arrange(arrival.date) %>%
  plot_ly(
    x = ~arrival.date,
    y = ~total.individuals,
    color = ~deposit_type,
    alpha = 0.7 # fixed: was `alphtrain`, a typo plot_ly silently ignores
  ) %>% layout(
    title = 'total.individuals by date/deposit_type',
    xaxis = list(title = ''),
    yaxis = list(title = '')
    )

time series graph – grouped (distribution_channel)

# total headcount per arrival date, split by distribution channel
a %>% group_by(arrival.date, distribution_channel) %>% 
  summarise(total.individuals = sum(adults, children, babies), .groups = 'drop') %>%  
  arrange(arrival.date) %>%
  plot_ly(
    x = ~arrival.date,
    y = ~total.individuals,
    color = ~distribution_channel,
    alpha = 0.7 # fixed: was `alphtrain`, a typo plot_ly silently ignores
  ) %>% layout(
    title = 'total.individuals by date/distribution_channel',
    xaxis = list(title = ''),
    yaxis = list(title = '')
    )

EDA: nom vars

sample data

a %>% select(where(is.factor)) %>% slice_sample(n = 10)

glimpse structure

a %>% select(where(is.factor)) %>% glimpse
## Rows: 75,166
## Columns: 17
## $ agent                     <fct> NULL, NULL, NULL, 304, 240, 240, NULL, 30...
## $ arrival_date_day_of_month <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ arrival_date_month        <fct> July, July, July, July, July, July, July,...
## $ arrival_date_week_number  <fct> 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 2...
## $ arrival_date_year         <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015,...
## $ assigned_room_type        <fct> C, C, C, A, A, A, C, C, D, E, G, E, E, E,...
## $ company                   <fct> NULL, NULL, NULL, NULL, NULL, NULL, NULL,...
## $ country                   <fct> PRT, PRT, GBR, GBR, GBR, GBR, PRT, PRT, P...
## $ customer_type             <fct> Transient, Transient, Transient, Transien...
## $ deposit_type              <fct> No Deposit, No Deposit, No Deposit, No De...
## $ distribution_channel      <fct> Direct, Direct, Direct, Corporate, TA/TO,...
## $ hotel                     <fct> Resort Hotel, Resort Hotel, Resort Hotel,...
## $ is_repeated_guest         <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ market_segment            <fct> Direct, Direct, Direct, Corporate, Online...
## $ meal                      <fct> BB, BB, BB, BB, BB, BB, BB, FB, HB, BB, H...
## $ reservation_status        <fct> Check-Out, Check-Out, Check-Out, Check-Ou...
## $ reserved_room_type        <fct> C, C, A, A, A, A, C, C, D, D, G, E, D, E,...

check missing values

a %>% select(where(is.factor)) %>% miss_var_summary

distribution of level counts per factor

# 15-color palette interpolated from the 8-color Dark2 brewer palette
jpal = colorRampPalette(brewer.pal(8,'Dark2'))(15)

# bar chart of the number of distinct levels per factor column
a %>% select(where(is.factor)) %>%
  map(n_unique) %>%
  as_tibble() %>% # as.tibble() is deprecated; as_tibble() is the supported spelling
  pivot_longer(everything()) %>%
  plot_ly(y = ~name, x = ~value, color = ~name, colors = jpal) %>%
  add_bars() %>%
  hide_legend() %>% 
  layout(
    title = 'distribution of level counts per factor',
    xaxis = list(title = ''),
    yaxis = list(title = '')
    )

reference: names of unique levels

a %>% select(where(is.factor)) %>%
  map(unique)
## $agent
##   [1] NULL 304  240  303  241  8    250  115  5    253  6    156  175  134  243 
##  [16] 242  69   1    3    105  40   11   147  142  196  306  184  96   2    127 
##  [31] 128  95   143  13   7    146  27   177  26   244  149  22   15   261  17  
##  [46] 167  300  305  67   19   152  171  104  36   28   9    71   110  14   42  
##  [61] 258  20   181  66   45   114  34   29   37   57   61   16   88   10   39  
##  [76] 21   24   251  275  301  50   30   68   4    70   72   75   245  54   81  
##  [91] 248  89   208  183  74   52   86   201  12   44   92   256  31   314  83  
## [106] 94   273  82   281  126  193  63   32   90   79   99   85   117  112  87  
## [121] 185  47   106  98   111  330  324  334  328  326  182  321  332  313  38  
## [136] 339  119  336  78   135  148  151  155  348  350  138  56   91   103  352 
## [151] 121  158  168  335  308  118  159  139  195  180  355  315  363  144  153 
## [166] 211  210  360  187  375  129  213  174  327  220  387  270  173  367  216 
## [181] 154  298  384  232  64   35   23   58   385  393  406  205  157  249  405 
## [196] 133  150  214  192  132  191  163  267  414  333  215  252  227  247  427 
## [211] 278  219  307  430  431  429  420  280  426  285  433  289  269  438  436 
## [226] 418  265  441  295  294  223  432  282  288  446  290  122  325  450  341 
## [241] 434  310  454  234  455  344  77   59   368  451  346  468  464  359  283 
## [256] 364  370  33   358  371  411  25   179  53   481  141  378  391  254  397 
## [271] 331  469  165  416  404  299  73   467  197  510  354  440  444  296  337 
## [286] 476  526  493  394  425  461  408  390  527  388  502  453  479  410  262 
## [301] 508  459  229  474  475  423  480  484  497  535  302  531  495  509  449 
## 334 Levels: 1 10 103 104 105 106 107 11 110 111 112 114 115 117 118 119 ... NULL
## 
## $arrival_date_day_of_month
##  [1] 1  2  3  4  5  6  7  8  9  10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31
## 31 Levels: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 ... 31
## 
## $arrival_date_month
##  [1] July      August    September October   November  December  January  
##  [8] February  March     April     May       June     
## 12 Levels: April August December February January July June March ... September
## 
## $arrival_date_week_number
##  [1] 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
## [26] 52 53 1  2  3  4  5  6  7  8  9  10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [51] 24 25 26
## 53 Levels: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 ... 53
## 
## $arrival_date_year
## [1] 2015 2016 2017
## Levels: 2015 2016 2017
## 
## $assigned_room_type
##  [1] C A D E G F I B H K
## Levels: A B C D E F G H I K L P
## 
## $company
##   [1] NULL 110  204  270  154  135  113  42   240  20   83   9    40   45   46  
##  [16] 103  38   144  108  47   49   307  268  51   62   67   82   200  195  68  
##  [31] 59   76   72   86   246  84   312  65   91   318  81   28   96   287  93  
##  [46] 174  319  274  100  31   223  286  159  292  281  92   116  115  101  105 
##  [61] 323  317  329  94   118  325  43   331  88   11   324  53   278  12   297 
##  [76] 137  139  80   16   337  142  338  127  130  342  120  148  343  346  347 
##  [91] 107  140  349  143  149  351  289  61   291  160  353  355  290  163  8   
## [106] 54   85   99   250  269  358  362  192  361  14   366  372  277  365  109 
## [121] 207  178  377  179  378  379  183  380  22   364  360  371  232  197  384 
## [136] 203  167  185  219  217  388  212  209  390  180  391  215  376  193  392 
## [151] 221  165  153  213  224  186  396  302  398  225  237  6    216  222  263 
## [166] 370  399  230  169  367  234  397  35   400  369  401  402  403  227  251 
## [181] 282  405  158  146  168  245  218  104  382  255  78   408  258  254  409 
## [196] 259  260  413  238  407  34   10   272  257  106  271  18   419  275  415 
## [211] 273  421  424  210  425  423  428  422  71   284  395  439  435  442  301 
## [226] 233  305  445  293  264  311  288  304  308  313  320  448  150  443  314 
## [241] 454  334  332  444  394  333  456  52   459  341  458  39   465  460  447 
## [256] 352  470  356  243  466  73   368  457  482  350  484  184  485  330  32  
## [271] 383  487  491  490  494  112  496  393  499  132  29   501  220  412  411 
## [286] 420  504  410  511  426  507  417  514  506  498  242  515  516  512  126 
## [301] 64   429  477  433  518  446  450  523  521  418  528  530  437  280  357 
## [316] 479  478  483  534  436  489  229  486  481  539  525  497  541  520  455 
## [331] 451  492 
## 353 Levels: 10 100 101 102 103 104 105 106 107 108 109 11 110 112 113 ... NULL
## 
## $country
##   [1] PRT  GBR  USA  ESP  IRL  FRA  NULL ROU  NOR  OMN  ARG  DEU  POL  BEL  BRA 
##  [16] ITA  CHE  CN   GRC  NLD  DNK  RUS  SWE  AUS  EST  CZE  FIN  AUT  HUN  ISR 
##  [31] MOZ  BWA  NZL  LUX  IDN  SVN  ALB  MAR  CHN  HRV  AGO  BGR  IND  DZA  MEX 
##  [46] TUN  COL  KAZ  LVA  STP  UKR  VEN  TWN  SMR  KOR  TUR  BLR  JPN  PRI  SRB 
##  [61] LTU  CPV  AZE  LBN  CRI  CHL  THA  IRN  SVK  EGY  CMR  LIE  MYS  SAU  ZAF 
##  [76] MKD  MMR  DOM  IRQ  SGP  CYM  ZMB  PAN  ZWE  SEN  NGA  GIB  ARM  PER  KNA 
##  [91] JOR  KWT  LKA  GEO  TMP  ETH  MUS  ECU  PHL  CUB  ARE  BFA  CYP  KEN  BIH 
## [106] COM  SUR  JAM  MCO  GNB  RWA  LBY  UGA  TZA  CIV  SYR  MLI  BGD  ISL  BHR 
## [121] NAM  BOL  BDI  MLT  URY  PAK  SYC  QAT  PRY  BRB  ABW  AND  VNM  AIA  SLV 
## [136] PLW  DMA  GAB  CAF  PYF  LCA  GUY  ATA  MDV  MWI  MNE  GTM  MDG  GHA  ASM 
## [151] TGO  UZB  NPL  MRT  BHS  HKG  NCL  MAC  KIR  SDN  ATF  TJK  DJI  SLE  LAO 
## [166] FRO 
## 178 Levels: ABW AGO AIA ALB AND ARE ARG ARM ASM ATA ATF AUS AUT AZE BDI ... ZWE
## 
## $customer_type
## [1] Transient       Contract        Transient-Party Group          
## Levels: Contract Group Transient Transient-Party
## 
## $deposit_type
## [1] No Deposit Refundable Non Refund
## Levels: No Deposit Non Refund Refundable
## 
## $distribution_channel
## [1] Direct    Corporate TA/TO     Undefined GDS      
## Levels: Corporate Direct GDS TA/TO Undefined
## 
## $hotel
## [1] Resort Hotel City Hotel  
## Levels: City Hotel Resort Hotel
## 
## $is_repeated_guest
## [1] 0 1
## Levels: 0 1
## 
## $market_segment
## [1] Direct        Corporate     Online TA     Offline TA/TO Groups       
## [6] Complementary Aviation     
## 8 Levels: Aviation Complementary Corporate Direct Groups ... Undefined
## 
## $meal
## [1] BB        FB        HB        SC        Undefined
## Levels: BB FB HB SC Undefined
## 
## $reservation_status
## [1] Check-Out
## Levels: Canceled Check-Out No-Show
## 
## $reserved_room_type
## [1] C A D G E F H L B
## Levels: A B C D E F G H L P
# order month levels chronologically; base constant month.name is exactly
# 'January'..'December' in order, replacing the error-prone hand-typed vector
a = a %>% mutate(arrival_date_month = factor(arrival_date_month, levels = month.name))

EDA: num vars

check missing values

a %>% select(where(is.numeric)) %>% miss_var_summary

sample data

a %>% select(where(is.numeric)) %>% slice_sample(n = 10)

glimpse structure

a %>% select(where(is.numeric)) %>% glimpse
## Rows: 75,166
## Columns: 13
## $ adr                            <dbl> 0.00, 0.00, 75.00, 75.00, 98.00, 98....
## $ adults                         <dbl> 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, ...
## $ babies                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ booking_changes                <dbl> 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ children                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ days_in_waiting_list           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ lead_time                      <dbl> 342, 737, 7, 13, 14, 14, 0, 9, 35, 6...
## $ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ previous_cancellations         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ required_car_parking_spaces    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ stays_in_week_nights           <dbl> 0, 0, 1, 1, 2, 2, 2, 2, 4, 4, 4, 4, ...
## $ stays_in_weekend_nights        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ total_of_special_requests      <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 0, 3, 1, 0, ...

viz: outliers

a %>% select(where(is.numeric)) %>% dlookr::plot_outlier()

There are many upper outliers. When building a prediction model, perhaps we should consider transforming them or removing them entirely.

check right tailness

# upper-tail quantiles of a numeric vector; probs is now a parameter
# (defaulting to the original cut points) and NAs are ignored instead of
# raising an error
jquantiles = function(col, probs = c(0.90, 0.95, 0.99, 1)){quantile(col, probs = probs, na.rm = TRUE)}

# table of upper-tail quantiles for every numeric column
a %>% na.omit %>% select(where(is.numeric)) %>%
  map(.x = . , jquantiles) %>%
  as.data.frame.list() %>%
  rownames_to_column() %>%
  as_tibble() # as.tibble() is deprecated; as_tibble() is the supported spelling

viz: normality

a %>% select(where(is.numeric)) %>% dlookr::plot_normality()
## Warning in log(x): NaNs produced
## Warning in sqrt(x): NaNs produced

viz: distribution histogram (with outliers)

#with outliers
a %>% select(where(is.numeric)) %>% DataExplorer::plot_histogram(nrow = 2, ncol = 1)

viz: distribution histogram (without most extreme outliers)

#no extreme right tailed outliers
# NOTE(review): several hard-coded cutoffs disagree with the skim() maxima
# above (adults max is 4, not 55; lead_time max 737, not 709;
# days_in_waiting_list max 379, not 391; previous_cancellations max 13,
# not 26) — those filters remove nothing while the true extreme values stay
# in. Verify the intended thresholds; they may be stale from an earlier run.
a %>% select(where(is.numeric)) %>% filter(
  adr != 510,
  adults != 55,
  babies != 10,
  booking_changes != 21,
  children != 10,
  days_in_waiting_list != 391,
  lead_time != 709,
  previous_bookings_not_canceled != 72,
  previous_cancellations != 26,
  required_car_parking_spaces != 8,
  stays_in_week_nights != 50,
  stays_in_weekend_nights != 19
) %>% DataExplorer::plot_histogram(nrow = 2, ncol = 1)

viz: distribution bivariate (without extreme outliers)

#no outliers
# NOTE(review): same stale-cutoff concern as the histogram chunk above —
# several `!=` values (adults 55, lead_time 709, days_in_waiting_list 391,
# previous_cancellations 26) do not match the skim() maxima, so some
# filters are no-ops. Confirm the intended thresholds.
a %>% select(hotel, where(is.numeric)) %>% filter(
  adr != 510,
  adults != 55,
  babies != 10,
  booking_changes != 21,
  children != 10,
  days_in_waiting_list != 391,
  lead_time != 709,
  previous_bookings_not_canceled != 72,
  previous_cancellations != 26,
  required_car_parking_spaces != 8,
  stays_in_week_nights != 50,
  stays_in_weekend_nights != 19
) %>% DataExplorer::plot_boxplot(by = 'hotel', nrow = 3, ncol = 1)

EDA: ADR further investigation

CAUTION: You should calculate ADR per adult since the ds is at the booking level not the individual level. Also you should remove bookings with children

#check: check number of bookings by combination of adult and children counts
a %>% count(adults, children)
a %>% count(adults, children) %>% filter(adults == 0) %>% summarise(total.bookings.without.adults = sum(n))
a %>% count(adults, children) %>% filter(children == 0) %>% summarise(total.bookings.without.children = sum(n))
a %>% count(adults, children) %>% filter(children == 0, adults == 0) %>% summarise(total.bookings.without.adults.and.children = sum(n))

There are very few bookings without adults, but it’s good to filter them out to get the most accurate results.

percent of ds with only adult booking data

paste(
  scales::percent(
    nrow(a %>% filter(adults > 0, children == 0)) / nrow(a)
    ),
  'of all observations have 1 or more adults and zero children.'
)
## [1] "93% of all observations have 1 or more adults and zero children."
a.adult.adrs = a %>% filter(adults > 0, children == 0)

distribution ADR per child

# box plot of ADR per child for child-only bookings; plotly's layout() takes
# the title by name — the original passed the string positionally, so it was
# silently dropped
a %>% filter(adults == 0, children >0) %>% mutate(adr.per.child = adr/children) %>% plot_ly(x = ~adr.per.child)  %>% add_boxplot() %>% layout(title = 'ADR distribution per child')

Median ADR per child is $43.54

create adr.per.adult var

#creating adr per adult var
a.adult.adrs = a.adult.adrs %>% mutate(adr.per.adult = adr/adults)

a.adult.adrs %>%
  plot_ly(y = ~hotel, x = ~adr.per.adult, color = ~hotel, colors = jpal[1:2]) %>% 
  add_boxplot() %>% 
  layout(
    title = 'ADR per Adult by Hotel Type',
    xaxis = list(title = ''),
    yaxis = list(title = '')
  )
#https://stackoverflow.com/questions/57300053/split-a-plotly-boxplot-x-axis-by-group
a.adult.adrs %>%
  plot_ly(y = ~hotel, x = ~adr.per.adult, color = ~customer_type, colors = jpal, group = ~customer_type) %>% 
  add_boxplot() %>% 
  layout(
    boxmode = 'group', #SUPER IMPORTANT
    title = 'ADR per Adult by Hotel/customer_type'
    ) 
#https://stackoverflow.com/questions/57300053/split-a-plotly-boxplot-x-axis-by-group
a.adult.adrs %>%
  plot_ly(y = ~hotel, x = ~adr, color = ~market_segment, colors = jpal, group = ~market_segment) %>% 
  add_boxplot() %>% 
  layout(
    boxmode = 'group', #SUPER IMPORTANT
    title = 'ADR per Adult by Hotel/market_segment'
    ) 
#https://stackoverflow.com/questions/57300053/split-a-plotly-boxplot-x-axis-by-group
a.adult.adrs %>%
  plot_ly(x = ~hotel, y = ~adr, color = ~arrival_date_month, colors = jpal, group = ~arrival_date_month) %>% 
  add_boxplot() %>% 
  layout(
    boxmode = 'group', #SUPER IMPORTANT
    title = 'ADR per Adult by Hotel/arrival_date_month',
    hoverformat = '.0f'
    ) 
library(DescTools)
## 
## Attaching package: 'DescTools'
## The following object is masked from 'package:data.table':
## 
##     %like%

Observations

  1. On average, the Resort Hotel ($40) is cheaper than the City Hotel ($54)
  2. Not surprisingly, ‘Direct’ market segment customers pay more, while Group and Corporate customers pay less
  3. Not surprisingly, ADR rates are higher for the Resort Hotel in summer months

correlations: viz

a %>% select(where(is.numeric)) %>% dlookr::plot_correlate()

a %>% select(where(is.numeric)) %>% GGally::ggcorr(palette = "RdBu", label = TRUE)

EDA: viz: Paired XXXvariate

a %>% select(
  hotel,
  customer_type
) %>% GGally::ggpairs(
  mapping = aes(color = hotel)
  ) + scale_fill_manual(values = c('darkblue','darkorange'))

Goal 1: Predict Number of Adults by Hotel Type for the next 4 weeks

Aggregating at the weekly level to smooth out daily fluctuations

creating time series dataset

creating a separate time series ds for anomaly detection and predictions b/c time series ds needs to be split chronologically via ‘initial_time_split’, not randomly via ‘initial_split’

create ts dataset, filter, and split

#times series dataset
# NOTE: `a` already carries a correct `arrival.date` (built before the date
# component columns were converted to factors), so we reuse it rather than
# rebuilding it here — make_date() on factor year/day columns would operate
# on level codes, not the underlying values
ts =  a %>% na.omit %>% filter(
  adults > 0,
  children == 0
) %>% mutate(
    # weekly buckets starting on Sunday, to smooth daily fluctuations
    arrival.week = floor_date(arrival.date, 'week', week_start = 7)
  ) %>% group_by(arrival.week, hotel) %>% 
  summarise(total.adult.bookings = sum(adults), .groups = 'drop') %>% 
  arrange(arrival.week)
  
#---------------------------- city

# chronological split (initial_time_split), not random, since this is a
# time series
ts.city = ts %>% filter(hotel == 'City Hotel') %>% select(-hotel)
ts.city.split = initial_time_split(ts.city)
ts.city.a = training(ts.city.split)
ts.city.test = testing(ts.city.split)

#---------------------------- resort

ts.resort = ts %>% filter(hotel == 'Resort Hotel') %>% select(-hotel)
ts.resort.split = initial_time_split(ts.resort)
ts.resort.a = training(ts.resort.split)
ts.resort.test = testing(ts.resort.split)

Anomaly Detection

library(anomalize)
# time_decompose(data, target, method = c("stl", "twitter"), frequency = "auto", trend = "auto", ..., merge = FALSE, message = TRUE)
# anomalize(data, target, method = c("iqr", "gesd"), alpha = 0.05, max_anoms = 0.2, verbose = FALSE)
# The alpha parameter adjusts the width of the critical values. By default, alpha = 0.05.
# Lower values are more conservative while higher values are less prone to incorrectly classifying "normal" observations.
# max_anoms: The maximum percent of anomalies permitted to be identified.

# The STL method uses the stl() function from the stats package. STL works very well in circumstances where a long term trend is present (which applies in this case; see trend component in the prophet graphs below'). 
  
# decompose weekly city-hotel bookings (STL) and flag anomalies in the
# remainder; the decomposition is computed once and reused for the printed
# table and the plot (the original ran the whole pipeline twice).
# as.tibble() is deprecated -> as_tibble()
city.anomalies = ts.city.a %>% as_tibble() %>%
  time_decompose(total.adult.bookings, method = 'stl', merge = TRUE) %>%
  anomalize(remainder, alpha = 0.15, method = 'gesd') %>% #increasing sensitivity to outliers
  time_recompose()
city.anomalies
ggplotly(
  city.anomalies %>%
    plot_anomalies(
      ncol = 2,
      alpha_dots = 0.5,
      alpha_circles = 0.5,
      size_circles = 2,
      time_recomposed = TRUE,
      alpha_ribbon = 0.05
      ) + scale_y_continuous(labels = comma) +
    labs(x = '', y = 'total.bookings', title = 'city hotel total.adult.bookings')
  )
#----------------------------

# same anomaly detection for the resort hotel: decompose once, print, plot.
# as.tibble() is deprecated -> as_tibble()
resort.anomalies = ts.resort.a %>% as_tibble() %>%
  time_decompose(total.adult.bookings, method = 'stl', merge = TRUE) %>%
  anomalize(remainder, alpha = 0.15, method = 'gesd') %>% #increasing sensitivity to outliers
  time_recompose()
resort.anomalies
ggplotly(
  resort.anomalies %>%
    plot_anomalies(
      ncol = 2,
      alpha_dots = 0.5,
      alpha_circles = 0.5,
      size_circles = 2,
      time_recomposed = TRUE,
      alpha_ribbon = 0.05
      ) + scale_y_continuous(labels = comma) +
    labs(x = '', y = 'total.bookings', title = 'resort hotel total adults')
  )

Predicting Next 4 Wks Total Adults by Hotel Type

City Hotel

library(prophet)
## Loading required package: Rcpp
## 
## Attaching package: 'Rcpp'
## The following object is masked from 'package:rsample':
## 
##     populate
## Loading required package: rlang
## 
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
## 
##     %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
##     flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
##     splice
## The following object is masked from 'package:data.table':
## 
##     :=
# https://dygraphs.com/options.html

#renaming cols to prophet's col conventions
prophet.city.df = ts.city.a %>% select(ds = arrival.week, y = total.adult.bookings)

#creating model
prophet.city.mdl = prophet.city.df %>% prophet(yearly.seasonality = TRUE)
## Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
#using model make future period df
prophet.city.future.df = prophet.city.mdl %>% make_future_dataframe(
  periods = 4, #next 4 wks
  freq = 'week',
  include_history = TRUE
  )

#make forecasts df
prophet.city.forecast.df = prophet.city.mdl %>% predict(prophet.city.future.df)

prophet.city.forecast.df %>% head %>% DT::datatable()
#plot forecast (dygraphs; HTML allowed in the title)
prophet.city.mdl %>% dyplot.prophet(
  prophet.city.forecast.df,
  main = '<h style="color: black; font-size:18px;">City Hotel: Total Adults 4 Week Prediction</h>'
  ) %>%
  dygraphs::dyOptions(
    colors = c('darkgreen','blue'),
    pointSize = 2 # fixed: removed trailing comma, which creates an empty argument
    )
## Warning: `select_()` is deprecated as of dplyr 0.7.0.
## Please use `select()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
#plot forecast components
prophet.city.mdl %>% prophet_plot_components(prophet.city.forecast.df)

measure model performance - city

(prophet.city.preds.truth = tibble(
  arrival.date = ts.city.test %>% pull(arrival.week) %>% head(4),
  truth =  ts.city.test %>% pull(total.adult.bookings) %>% head(4),
  estimate = prophet.city.forecast.df %>% pull(yhat) %>% tail(4)
))
bind_rows(
prophet.city.preds.truth %>% yardstick::rmse(truth = truth, estimate = estimate),
prophet.city.preds.truth %>% yardstick::mae(truth = truth, estimate = estimate)
)

On average, our weekly predictions are off by 89 adults

Resort Hotel

library(prophet)
# https://dygraphs.com/options.html

#renaming cols to prophet's col conventions
prophet.resort.df = ts.resort.a %>% select(ds = arrival.week, y = total.adult.bookings)

#creating model
prophet.resort.mdl = prophet.resort.df %>% prophet(yearly.seasonality = TRUE)
## Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
#using model make future period df
prophet.resort.future.df = prophet.resort.mdl %>% make_future_dataframe(
  periods = 4, #next 4 wks
  freq = 'week',
  include_history = TRUE
  )

#make forecasts df
prophet.resort.forecast.df = prophet.resort.mdl %>% predict(prophet.resort.future.df)

prophet.resort.forecast.df %>% head %>% DT::datatable()
#plot forecast (dygraphs; HTML allowed in the title)
prophet.resort.mdl %>% dyplot.prophet(
  prophet.resort.forecast.df,
  main = '<h style="color: black; font-size:18px;">Resort Hotel: Total Adults 4 Week Prediction</h>'
  ) %>%
  dygraphs::dyOptions(
    colors = c('darkorange','blue'),
    pointSize = 2 # fixed: removed trailing comma, which creates an empty argument
    )
#plot forecast components
prophet.resort.mdl %>% prophet_plot_components(prophet.resort.forecast.df)

(prophet.resort.preds.truth = tibble(
  arrival.date = ts.resort.test %>% pull(arrival.week) %>% head(4),
  truth =  ts.resort.test %>% pull(total.adult.bookings) %>% head(4),
  estimate = prophet.resort.forecast.df %>% pull(yhat) %>% tail(4)
))
bind_rows(
prophet.resort.preds.truth %>% yardstick::rmse(truth = truth, estimate = estimate),
prophet.resort.preds.truth %>% yardstick::mae(truth = truth, estimate = estimate)
)

On average, our weekly predictions are off by 78 adults

Goal 2: Predict ADR

1) Split Data

set.seed(345)
split = initial_split(a %>% slice_sample(prop = 0.20), strata = hotel) ##!!<NOTE> change this later on to full ds, reducing size for testing purposes
train = rsample::training(split)
test = rsample::testing(split)
vfolds = vfold_cv(train, v = 3)

level count for factor vars

train %>% select(where(is.factor)) %>%
  map_df(n_unique) %>%
  pivot_longer(
    everything(),
    names_to = 'factor',
    values_to = 'unique.levels.count'
    ) %>% arrange(-unique.levels.count)

2) Create recipe

# Documentation: https://www.rdocumentation.org/packages/recipes/versions/0.1.14

# A recipe declares preprocessing steps; nothing is computed until prep()/bake().
# Outcome is adr; every remaining column is a predictor.
recipe = train %>% recipe(adr ~ . ) %>%
  # remove numeric predictors that are HIGHLY correlated with another predictor
  # (step_corr drops one of each pair above its correlation threshold, default 0.9)
  step_corr(all_numeric(),-all_outcomes()) %>% 
  #remove vars with low (near-zero) variance
  step_nzv(all_numeric(),-all_outcomes()) %>% 
  # NOTE(review): step_zv is redundant after step_nzv (near-zero variance
  # already covers zero variance), but it is harmless
  step_zv(all_numeric(),-all_outcomes()) %>%
  #----------------------------
  #reduce number of levels for factors with many, many levels
  step_other(agent, company, country) %>%  #default threshold of 0.05: rarer levels collapse into 'other'
  #----------------------------
  step_normalize(stays_in_weekend_nights, stays_in_week_nights) %>% #prob not necessary b/c both vars are already scaled the same, but doing so anyway to develop good habits
  step_pca(stays_in_weekend_nights, stays_in_week_nights, num_comp = 1) #will limit to PC1 only
  
tidy(recipe)

check interaction of recipe with vars

# Estimate (train) the recipe on `train` and print the trained steps
print(prep(recipe))
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor         30
## 
## Training data contained 11275 data points and no missing data.
## 
## Operations:
## 
## Correlation filter removed no terms [trained]
## Sparse, unbalanced variable filter removed babies, children, ... [trained]
## Zero variance filter removed no terms [trained]
## Collapsing factor levels for agent, company, country [trained]
## Centering and scaling for stays_in_weekend_nights, stays_in_week_nights [trained]
## PCA extraction with stays_in_weekend_nights, stays_in_week_nights [trained]

3) Preprocess train & test

# Train the recipe once, then apply it to both data sets; columns are
# sorted alphabetically with factors placed before numerics.
prepped <- prep(recipe)

#'Using the (trained) recipe, bake the train ds'
baked.train = prepped %>%
  bake(new_data = NULL) %>%
  select(sort(tidyselect::peek_vars())) %>%
  select(where(is.factor), where(is.numeric))

#'Using the (trained) recipe, bake the test ds'
baked.test = prepped %>%
  bake(new_data = test) %>%
  select(sort(tidyselect::peek_vars())) %>%
  select(where(is.factor), where(is.numeric))

baked.train %>% head() %>% DT::datatable()
baked.test %>% head() %>% DT::datatable()

### check that ‘step_other’ worked correctly

# Confirm step_other collapsed rare levels into an 'other' level
baked.train %>%
  select(agent, company, country) %>%
  purrr::map(levels)
## $agent
## [1] "240"   "9"     "NULL"  "other"
## 
## $company
## [1] "NULL"  "other"
## 
## $country
## [1] "DEU"   "ESP"   "FRA"   "GBR"   "PRT"   "other"
# Frequency of each (collapsed) level, most common first
baked.train %>%
  select(agent, company, country) %>%
  purrr::map(fct_count, sort = TRUE)
## $agent
## # A tibble: 4 x 2
##   f         n
##   <fct> <int>
## 1 other  5368
## 2 9      2771
## 3 NULL   1850
## 4 240    1286
## 
## $company
## # A tibble: 2 x 2
##   f         n
##   <fct> <int>
## 1 NULL  10420
## 2 other   855
## 
## $country
## # A tibble: 6 x 2
##   f         n
##   <fct> <int>
## 1 other  3580
## 2 PRT    3197
## 3 GBR    1375
## 4 FRA    1258
## 5 DEU     936
## 6 ESP     929

Intermediate Modeling

reference of var abbreviations

# Legend for the variable-name abbreviations used in the modeling sections
tribble(
  ~abbreviation, ~term,       ~description,
  'rec',         'recipe',    'executes necessary preprocessing steps on the features',
  'mdl',         'model',     'self explanatory',
  'wf',          'workflow',  'combines the recipe and model',
  'tg',          'tune_grid', 'executes on vfolds using the workflow, and optional hyperparameter grid'
) %>% DT::datatable()

1) Create model Specification w/hps to be tuned

#hyperparameters with a value of 'tune()' are those we will 'tune'
rf.mdl = parsnip::rand_forest(
  trees = 300,
  min_n = tune(), #min number of observations at terminal node
  mtry = tune() #number of vars to randomly subset at each node
) %>% 
  set_mode('regression') %>% 
  set_engine('ranger', importance = 'impurity_corrected')

2) Create workflow Specification

#create workflow (pipeline) combining recipe (preprocessing) and model (w/hps)
rf.wf = workflow() %>% 
  add_recipe(recipe) %>% 
  add_model(rf.mdl)

3) Execution: run the model & workflow on the vfold ds with automatically chosen hyperparameters

# Register a parallel backend so the resampling folds run concurrently
doParallel::registerDoParallel() #use parallel processing
set.seed(345)

# First pass: let tune_grid() pick 5 candidate (mtry, min_n) pairs on its
# own; the results guide the manual grid defined below.
rf.tg = tune_grid(
  rf.wf,
  resamples = vfolds,
  grid = 5) #Create a tuning grid AUTOMATICALLY
## i Creating pre-processing data to finalize unknown parameter: mtry
rf.tg
rf.tg %>% collect_metrics()

viz results

# Interactive scatter of mean RMSE against each hyperparameter value
rmse.by.hp <- rf.tg %>%
  collect_metrics() %>%
  filter(.metric == "rmse") %>%
  select(mean, min_n, mtry) %>%
  pivot_longer(
    c(min_n, mtry),
    values_to = "value",
    names_to = "parameter"
  )

ggplotly(
  ggplot(rmse.by.hp, aes(value, mean, color = parameter)) +
    geom_point(show.legend = FALSE, size = 3) +
    facet_wrap(~parameter, scales = "free_x") +
    labs(x = NULL, y = "RMSE")
)
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
# Candidate combos ranked best (lowest mean RMSE) first
collect_metrics(rf.tg) %>%
  filter(.metric == 'rmse') %>%
  arrange(mean) %>%
  select(mean, mtry, min_n, .config)

Based on the results above, we can now get an idea of the ranges we should set manually for our hyperparameters.

4) Create a tuning grid manually

# Manual search grid informed by the automatic pass:
# 3 levels each for min_n in [25, 35] and mtry in [15, 25] (9 combos total)
rf.gr = grid_regular(
  min_n(range = c(25, 35)),
  mtry(range = c(15,25)),
  levels = 3
)

rf.gr

5) Execution: run the model & workflow on the vfold ds with the manually specified hyperparameter grid

# Second tuning pass, this time over the manually specified grid rf.gr
doParallel::registerDoParallel() #use parallel processing
set.seed(345)

#tune over the MANUALLY created grid
rf.tg = tune_grid(rf.wf, resamples = vfolds, grid = rf.gr)

rf.tg
rf.tg %>% collect_metrics()

viz results

# Interactive scatter of mean RMSE vs each hp value (manual grid)
rmse.by.hp.manual <- rf.tg %>%
  collect_metrics() %>%
  filter(.metric == "rmse") %>%
  select(mean, min_n, mtry) %>%
  pivot_longer(
    c(min_n, mtry),
    values_to = "value",
    names_to = "parameter"
  )

ggplotly(
  ggplot(rmse.by.hp.manual, aes(value, mean, color = parameter)) +
    geom_point(show.legend = FALSE, size = 3) +
    facet_wrap(~parameter, scales = "free_x") +
    labs(x = NULL, y = "RMSE")
)

# Ranked table: best (lowest mean RMSE) combos first
collect_metrics(rf.tg) %>%
  filter(.metric == 'rmse') %>%
  arrange(mean) %>%
  select(mean, mtry, min_n, .config)

6) choose best model

# Pick the hp combo with the lowest RMSE. The metric argument is named
# explicitly: positional metric strings are deprecated in newer versions of
# tune, and the named form works in both old and new versions.
(rf.best.hps = select_best(rf.tg, metric = 'rmse'))

7) finalize model

# Plug the winning hyperparameters into the tunable model specification
rf.mdl.fin = rf.mdl %>%
  finalize_model(rf.best.hps)

8) Determine var importance

library(vip)

# Fit the finalized spec on the baked training data and plot variable
# importance. NOTE: the engine was already set to ranger with
# importance = 'impurity_corrected' when rf.mdl was specified, and
# finalize_model() preserves engine arguments, so the previous extra
# set_engine() call here was redundant and has been removed.
rf.mdl.fin %>%
  fit(adr ~ . ,
    data = baked.train
  ) %>%
  vip::vip(geom = "point")

9) Create final workflow, Execute on test, Evaluate results

#create final workflow
rf.wf.fin = workflow() %>% 
  add_recipe(recipe) %>% 
  add_model(rf.mdl.fin)

#fit on ENTIRE train, execute on test, evaluate results
rf.res.final = rf.wf.fin %>% 
  last_fit(split)

#check results
rf.res.final %>% collect_metrics()

Simple Modeling

doParallel::registerDoParallel() #use parallel processing

#!!<NOTE> to analyze var importance, you need an importance arg
# The 'impurity_corrected' importance measure is unbiased in terms of the number of categories and category frequencies and is almost as fast as the standard impurity importance. 
model.rf = ranger::ranger(
  adr ~ . ,
  data = baked.train,
  num.trees = 300,
  importance = 'impurity_corrected'
)

model.rf

#viz var importance: named importance vector -> two-column (var, imp) data frame
model.rf.var.imp = model.rf$variable.importance %>%
  as.matrix() %>%
  as.data.frame.matrix() %>%
  rownames_to_column() %>%
  rename(var = rowname, imp = V1) %>%
  arrange(desc(imp))

#horizontal bar chart, most important variable on top
model.rf.var.imp %>%
  ggplot(aes(fct_reorder(var, imp), imp)) +
  geom_col() +
  coord_flip()

#make predictions on the baked test set (seed fixes ranger's prediction RNG)
model.rf.preds = predict(model.rf, baked.test, seed = 345)$predictions %>% as.vector()

#pair truth (y) with predictions
model.rf.df = tibble(y = baked.test$adr, preds = model.rf.preds)

#viz: points should hug the fitted line if predictions are good
model.rf.df %>%
  ggplot(aes(y, preds)) +
  geom_point() +
  geom_smooth(method = 'lm')

#what is rmse and mae?
# NOTE: the previous `rmse =` / `mae =` argument names were silently ignored
# by bind_rows() (names of ... only matter when `.id` is supplied), and each
# yardstick result already carries a .metric column — so the names are dropped.
bind_rows(
  model.rf.df %>% yardstick::rmse(truth = y, estimate = preds),
  model.rf.df %>% yardstick::mae(truth = y, estimate = preds)
  )
#compare relative performance
#what does mae look like rel. to sd of response var?
paste("test set's standard deviation is: ", round(sd(test$adr),1))

Observations

  1. Due to seasonality, it’s not surprising that date vars rank highly in var importance
  2. Reserved Room type is also self-explanatory
  3. Ditto for market segment (e.g. Group rates are cheaper than Impromptu ‘Transient’ bookings)
  4. Interestingly, type of hotel (City, Resort) is ~equal in var imp. to type of agent
    • looking at the agent levels above, perhaps having no agent (NULL) or agent 9 makes a decent impact on adr